ph: pH of water (0 to 14).
Hardness: Capacity of water to precipitate soap in mg/L.
Solids: Total dissolved solids in ppm.
Chloramines: Amount of Chloramines in ppm.
Sulfate: Amount of Sulfates dissolved in mg/L.
Conductivity: Electrical conductivity of water in μS/cm.
Organic_carbon: Amount of organic carbon in ppm.
Trihalomethanes: Amount of Trihalomethanes in μg/L.
Turbidity: Measure of the light-emitting property of water in NTU.
Potability: Indicates if water is safe for human consumption. Potable - 1 and Not potable - 0
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
from collections import Counter
import os
# Print the full path of every file found under the course data directory.
data_root = '/Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA'
for folder, _subdirs, names in os.walk(data_root):
    for name in names:
        print(os.path.join(folder, name))
/Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/.Rhistory /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T8-6.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T8-4.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T8-5.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/.DS_Store /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T1-11.dat /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T11-4-1.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T5-12.dat /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T9-12.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T1-9.dat /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T5-1.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T1-2.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/.RData /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T1-3.dat /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T6-1.dat /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T11-7.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T11-4.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T4-1.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/water_potability.csv /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T4-5.DAT /Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/T4-6.DAT
# Five-step colour palettes (listed dark to light) reused by the plots below.
colors_blue = ["#132C33", "#264D58", '#17869E', '#51C4D3', '#B4DBE9']
colors_dark = ["#1F1F1F", "#313131", '#636363', '#AEAEAE', '#DADADA']
colors_green = ['#01411C', '#4B6F44', '#4F7942', '#74C365', '#D0F0C0']

# Preview each palette as a swatch strip, in the order blue, green, dark.
for palette in (colors_blue, colors_green, colors_dark):
    sns.palplot(palette)
# Load the water-potability table and print each column's dtype and non-null count.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
wq_df = pd.read_csv('/Users/always/Documents/大三下/Applied Multivariate Statistical Analysis/DATA/water_potability.csv')
wq_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
绘制目前数据集的分布图
# One entry per feature: the column to plot, the figure title and its
# horizontal position, the x-axis label, the x positions of the dotted
# reference lines (in drawing order) and the text annotations to add.
raw_hist_specs = [
    dict(column='Hardness', title='Hardness Distribution', title_x=0.53,
         xlabel='Hardness (mg/L)', guides=[151, 301, 76],
         notes=[
             dict(text='<76 mg/L is<br> considered soft', x=40, y=130, font_size=9),
             dict(text='Between 76 and 150<br> (mg/L) is<br>moderately hard', x=113, y=130, font_size=9),
             dict(text='Between 151 and 300 (mg/L)<br> is considered hard', x=250, y=130, font_size=9),
             dict(text='>300 mg/L is<br> considered very hard', x=340, y=130, font_size=9),
         ]),
    dict(column='ph', title='ph Distribution', title_x=0.5,
         xlabel='pH Level', guides=[7],
         notes=[
             dict(text='<7 is Acidic', x=4, y=70, font_size=10),
             dict(text='>7 is Basic', x=10, y=70, font_size=10),
         ]),
    dict(column='Solids', title='Solids Distribution', title_x=0.5,
         xlabel='Dissolved Solids (ppm)', guides=[], notes=[]),
    dict(column='Chloramines', title='Chloramines Distribution', title_x=0.53,
         xlabel='Chloramines (ppm)', guides=[4],
         notes=[dict(text='<4 ppm is considered<br> safe for drinking', x=1.8, y=90)]),
    dict(column='Sulfate', title='Sulfate Distribution', title_x=0.53,
         xlabel='Sulfate (mg/L)', guides=[250],
         notes=[dict(text='<250 mg/L is considered<br> safe for drinking', x=175, y=90)]),
    dict(column='Conductivity', title='Conductivity Distribution', title_x=0.5,
         xlabel='Conductivity (μS/cm)', guides=[],
         notes=[dict(text='The Conductivity range <br> is safe for both (200-800),<br> Potable and Non-Potable water',
                     x=600, y=90)]),
    dict(column='Organic_carbon', title='Organic Carbon Distribution', title_x=0.5,
         xlabel='Organic Carbon (ppm)', guides=[10],
         notes=[dict(text='Typical Organic Carbon<br> level is upto 10 ppm', x=5.3, y=110)]),
    dict(column='Trihalomethanes', title='Trihalomethanes Distribution', title_x=0.5,
         xlabel='Trihalomethanes (μg/L)', guides=[80],
         notes=[dict(text='Upper limit of Trihalomethanes<br> level is 80 μg/L', x=115, y=90)]),
    dict(column='Turbidity', title='Turbidity Distribution', title_x=0.5,
         xlabel='Turbidity (NTU)', guides=[5],
         notes=[dict(text='<5 NTU Turbidity is<br> considered safe', x=6, y=90)]),
]

# Draw one grouped histogram (with marginal box plot) per feature, split by
# the Potability label, then decorate and show it.
for spec in raw_hist_specs:
    # NOTE(review): `y` is a Counter of the column's values and `histfunc`
    # is 'count'; kept as in the original cells -- confirm `y` is needed.
    fig = px.histogram(
        wq_df, x=spec['column'], y=Counter(wq_df[spec['column']]),
        color='Potability', template='plotly_white', marginal='box',
        opacity=0.7, nbins=100,
        color_discrete_sequence=[colors_green[3], colors_blue[3]],
        barmode='group', histfunc='count',
    )
    for guide_x in spec['guides']:
        fig.add_vline(x=guide_x, line_width=1, line_color=colors_dark[1],
                      line_dash='dot', opacity=0.7)
    for note in spec['notes']:
        fig.add_annotation(showarrow=False, **note)
    fig.update_layout(
        font_family='monospace',
        title=dict(text=spec['title'], x=spec['title_x'], y=0.95,
                   font=dict(color=colors_dark[2], size=20)),
        xaxis_title_text=spec['xlabel'],
        yaxis_title_text='Count',
        legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0,
                    tracegroupgap=5),
        bargap=0.3,
    )
    fig.show()
检查缺失值及处理
# Plot the missing-value pattern of every column with missingno.
msno.matrix(wq_df)
<AxesSubplot:>
从上图可以看到变量PH,Sulfate,Trihalomethanes均存在缺失值
# Number of missing entries per column.
wq_df.isnull().sum()
ph 491 Hardness 0 Solids 0 Chloramines 0 Sulfate 781 Conductivity 0 Organic_carbon 0 Trihalomethanes 162 Turbidity 0 Potability 0 dtype: int64
由于水质对微量元素的变化十分敏感,所以不能用均值或者中位数等方法直接进行填充缺失值。最保险的做法是直接删除含有缺失值的数据,删除之后我们仍有十分大的样本来处理数据。
# Drop every row that has at least one missing value (no imputation),
# then check how many samples of each class remain.
wq_df = wq_df.dropna()
wq_df.Potability.value_counts()
0 1200 1 811 Name: Potability, dtype: int64
离群值处理
首先绘制处理完缺失值后的分布图
# Same nine distribution plots as before, now drawn from the data frame left
# after the rows with missing values were dropped. Each entry gives the
# column, title and title position, x-axis label, dotted reference-line
# positions (in drawing order) and the annotations to place on the figure.
clean_hist_specs = [
    dict(column='Hardness', title='Hardness Distribution', title_x=0.53,
         xlabel='Hardness (mg/L)', guides=[151, 301, 76],
         notes=[
             dict(text='<76 mg/L is<br> considered soft', x=40, y=130, font_size=9),
             dict(text='Between 76 and 150<br> (mg/L) is<br>moderately hard', x=113, y=130, font_size=9),
             dict(text='Between 151 and 300 (mg/L)<br> is considered hard', x=250, y=130, font_size=9),
             dict(text='>300 mg/L is<br> considered very hard', x=340, y=130, font_size=9),
         ]),
    dict(column='ph', title='ph Distribution', title_x=0.5,
         xlabel='pH Level', guides=[7],
         notes=[
             dict(text='<7 is Acidic', x=4, y=70, font_size=10),
             dict(text='>7 is Basic', x=10, y=70, font_size=10),
         ]),
    dict(column='Solids', title='Solids Distribution', title_x=0.5,
         xlabel='Dissolved Solids (ppm)', guides=[], notes=[]),
    dict(column='Chloramines', title='Chloramines Distribution', title_x=0.53,
         xlabel='Chloramines (ppm)', guides=[4],
         notes=[dict(text='<4 ppm is considered<br> safe for drinking', x=1.8, y=90)]),
    dict(column='Sulfate', title='Sulfate Distribution', title_x=0.53,
         xlabel='Sulfate (mg/L)', guides=[250],
         notes=[dict(text='<250 mg/L is considered<br> safe for drinking', x=175, y=90)]),
    dict(column='Conductivity', title='Conductivity Distribution', title_x=0.5,
         xlabel='Conductivity (μS/cm)', guides=[],
         notes=[dict(text='The Conductivity range <br> is safe for both (200-800),<br> Potable and Non-Potable water',
                     x=600, y=90)]),
    dict(column='Organic_carbon', title='Organic Carbon Distribution', title_x=0.5,
         xlabel='Organic Carbon (ppm)', guides=[10],
         notes=[dict(text='Typical Organic Carbon<br> level is upto 10 ppm', x=5.3, y=110)]),
    dict(column='Trihalomethanes', title='Trihalomethanes Distribution', title_x=0.5,
         xlabel='Trihalomethanes (μg/L)', guides=[80],
         notes=[dict(text='Upper limit of Trihalomethanes<br> level is 80 μg/L', x=115, y=90)]),
    dict(column='Turbidity', title='Turbidity Distribution', title_x=0.5,
         xlabel='Turbidity (NTU)', guides=[5],
         notes=[dict(text='<5 NTU Turbidity is<br> considered safe', x=6, y=90)]),
]

for plot_cfg in clean_hist_specs:
    feature = plot_cfg['column']
    # NOTE(review): `y` is a Counter of the column's values and `histfunc`
    # is 'count'; kept as in the original cells -- confirm `y` is needed.
    fig = px.histogram(
        wq_df, x=feature, y=Counter(wq_df[feature]),
        color='Potability', template='plotly_white', marginal='box',
        opacity=0.7, nbins=100,
        color_discrete_sequence=[colors_green[3], colors_blue[3]],
        barmode='group', histfunc='count',
    )
    # Dotted vertical lines mark the thresholds quoted in the annotations.
    for line_x in plot_cfg['guides']:
        fig.add_vline(x=line_x, line_width=1, line_color=colors_dark[1],
                      line_dash='dot', opacity=0.7)
    for annotation in plot_cfg['notes']:
        fig.add_annotation(showarrow=False, **annotation)
    fig.update_layout(
        font_family='monospace',
        title=dict(text=plot_cfg['title'], x=plot_cfg['title_x'], y=0.95,
                   font=dict(color=colors_dark[2], size=20)),
        xaxis_title_text=plot_cfg['xlabel'],
        yaxis_title_text='Count',
        legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0,
                    tracegroupgap=5),
        bargap=0.3,
    )
    fig.show()
可以看到存在很多离群值,接下来对离群值进行处理:将小于0.01分位数的数据替换为0.01分位数,将大于0.99分位数的数据替换为0.99分位数(即缩尾处理,样本本身不被删除,样本量保持不变)
def outlier_removal(x):
    """Clip a numeric Series to its 1st-99th percentile range.

    Despite the name, no observations are dropped: values above the 0.99
    quantile are replaced by that quantile, and values below the 0.01
    quantile are replaced by that quantile.

    Args:
        x: pandas Series (any object exposing ``.quantile`` works).

    Returns:
        numpy.ndarray of the same length as ``x`` holding the clipped values.
    """
    low, high = x.quantile(.01), x.quantile(.99)
    # Cap the upper tail first, then the lower tail, as two separate passes.
    capped = np.where(x > high, high, x)
    return np.where(capped < low, low, capped)
# Clip every column of the frame to its own 1st-99th percentile range.
wq_df = wq_df.apply(outlier_removal)

# One box plot per column, laid out on a 4x3 grid, to inspect what remains.
plt.figure(figsize=(10, 15))
for slot, column in enumerate(wq_df.columns, start=1):
    plt.subplot(4, 3, slot)
    sns.boxplot(data=wq_df, x=wq_df[column])
可以看到离群值已基本处理完毕
对数据进行标准化处理
对于ph值,为保留酸碱的信息,先减去7,然后除以最大值减去最小值,以此得到(-1,1)区间的数据,并用正负来区分酸碱
# Keep an un-standardised copy of the cleaned data before any rescaling.
wq_df_not_sd = wq_df.copy(deep=True)

# Centre pH on 7 so the sign separates acidic (< 0) from basic (> 0), then
# divide by the observed range. Series.max()/min() are used instead of the
# Python builtins: they are vectorised and skip NaN, whereas builtin
# max()/min() iterate element by element and give order-dependent results
# if a NaN is ever present.
ph_centered = wq_df['ph'] - 7
wq_df['ph'] = ph_centered / (ph_centered.max() - ph_centered.min())
对于其他变量采用z-score标准化处理
# Columns standardised to zero mean / unit variance (pH is handled separately).
zscore_columns = ['Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                  'Organic_carbon', 'Trihalomethanes', 'Turbidity']
zscore = preprocessing.StandardScaler()
wq_df[zscore_columns] = zscore.fit_transform(wq_df[zscore_columns])
wq_df
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 0.178496 | 0.573204 | 0.014034 | 0.599880 | 0.589691 | -0.794575 | 1.247236 | 2.157924 | 0.861605 | 0.0 |
| 4 | 0.283614 | -0.466173 | -0.463095 | -0.378244 | -0.580160 | -0.351871 | -0.854478 | -2.188812 | 0.138225 | 0.0 |
| 5 | -0.191936 | -0.240884 | 0.809001 | 0.267231 | -0.166209 | -1.837569 | -1.819601 | -0.731082 | -1.841541 | 0.0 |
| 6 | 0.437015 | 1.625903 | 0.809123 | 0.246889 | 1.509981 | -1.797465 | -0.172647 | 1.156962 | -1.693545 | 0.0 |
| 7 | 0.221750 | 0.229205 | -0.971818 | -1.660823 | -0.750969 | 0.607962 | -0.608338 | -0.229877 | 0.564587 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3267 | 0.269744 | 0.594258 | -0.706132 | -0.539433 | -0.510213 | -0.452652 | -1.361451 | -0.721450 | 0.842102 | 1.0 |
| 3268 | -0.040322 | 0.352898 | -0.549566 | 0.372786 | -0.720929 | -1.222871 | 0.569132 | -2.386477 | -0.687578 | 1.0 |
| 3269 | 0.516602 | -2.565939 | 1.805934 | 1.378272 | -1.861484 | 0.170678 | 0.555520 | -1.580753 | 0.522570 | 1.0 |
| 3270 | -0.126120 | -0.292562 | 0.500724 | 0.398282 | 0.309783 | -0.131728 | -0.698844 | -0.381145 | -0.391367 | 1.0 |
| 3271 | -0.316104 | -0.073181 | 2.754201 | 0.022670 | 0.666323 | 1.260683 | -0.140648 | 0.017493 | 0.609524 | 1.0 |
2011 rows × 10 columns
数据可视化处理
首先是处理后数据的正例负例的比例情况,约为40%的正例以及60%的负例,这里可以对样本进行平衡处理,但样本本身的正负例比例或许传递了现实中正负例的先验信息,因此在这里将不采取平衡样本
# Class counts in a fixed order (0 = not potable, 1 = potable) so the slice
# names and colours below can never be paired with the wrong class.
potability_counts = wq_df['Potability'].value_counts().reindex([0, 1], fill_value=0)
# Build the frame with an explicit 'Potability' column: on pandas >= 2.0 the
# Series returned by value_counts() is named 'count', so wrapping it directly
# in pd.DataFrame would not produce the column px.pie is asked for.
d = pd.DataFrame({'Potability': potability_counts.to_numpy()}, index=[0, 1])
fig = px.pie(d, values='Potability', names=['Not Potable', 'Potable'], hole=0.4, opacity=0.6,
             color_discrete_sequence=[colors_green[3], colors_blue[3]],
             labels={'label': 'Potability', 'Potability': 'No. Of Samples'})
# Caption placed in the centre of the donut hole.
fig.add_annotation(text='Potability',
                   x=0.5, y=0.5, showarrow=False, font_size=14, opacity=0.7, font_family='monospace')
fig.update_layout(
    font_family='monospace',
    title=dict(text='How many samples of water are Potable?', x=0.47, y=0.98,
               font=dict(color=colors_dark[2], size=20)),
    legend=dict(x=0.37, y=-0.05, orientation='h', traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()
绘制各变量的样本相关系数矩阵及两两之间的散布图
# Heat map of the pairwise sample correlations, annotated with the coefficients.
corr_matrix = wq_df.corr()
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=True, vmin=-1)
plt.show()

# Scatter-plot matrix of all column pairs, coloured by the Potability label.
sns.pairplot(data=wq_df, hue="Potability")
<seaborn.axisgrid.PairGrid at 0x7fed1429e880>
将各变量与饮用性的相关系数比较画出如下: 可以看到各变量与饮用性之间相关性十分低,且上文的相关系数图说明各变量之间的相关性也很低。
一方面说明实验样本的变量选择十分到位,避免了重复信息的收集,另一方面数据本身已基本满足变量间的不相关,因此后续将不展开主成分或主因子的分析
# Horizontal bar chart of each column's correlation with Potability
# (the Potability row itself is included).
wq_df.corr()['Potability'].plot(kind = 'barh')
<AxesSubplot:>
绘制频数直方图如下:
# Frequency histogram of every feature column. The grid is sized from the
# number of features so none is dropped: the previous fixed `range(8)` on a
# 4x2 grid skipped the ninth feature, Turbidity.
hist_columns = [col for col in wq_df.columns if col != 'Potability']
hist_grid_cols = 3
hist_grid_rows = -(-len(hist_columns) // hist_grid_cols)  # ceiling division

plt.figure(figsize=(20, 20))
for position, column in enumerate(hist_columns, start=1):
    plt.subplot(hist_grid_rows, hist_grid_cols, position)
    sns.histplot(wq_df[column])
    plt.title(column, fontdict={'size': 20, 'weight': 'bold'}, pad=3)
plt.show()
分出训练集、验证集和测试集(6:2:2)供后续分类学习使用(其中未标准化的数据另按默认的75%:25%划分为训练集和测试集):
# Split the un-standardised data with train_test_split's default test size;
# random_state=0 makes the split reproducible.
X_train_not_sd, X_test_not_sd, y_train_not_sd, y_test_not_sd = train_test_split(
    wq_df_not_sd.loc[:, ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                         'Organic_carbon', 'Trihalomethanes', 'Turbidity']],
    wq_df_not_sd['Potability'], random_state=0)
# Build the labelled training table as a new frame. Plain assignment only
# aliased X_train_not_sd, so adding the label column also put the target
# into the feature matrix.
Train_not_sd = X_train_not_sd.assign(Potability=y_train_not_sd)
Train_not_sd.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1508 entries, 968 to 1099 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 1508 non-null float64 1 Hardness 1508 non-null float64 2 Solids 1508 non-null float64 3 Chloramines 1508 non-null float64 4 Sulfate 1508 non-null float64 5 Conductivity 1508 non-null float64 6 Organic_carbon 1508 non-null float64 7 Trihalomethanes 1508 non-null float64 8 Turbidity 1508 non-null float64 9 Potability 1508 non-null float64 dtypes: float64(10) memory usage: 129.6 KB
# Labelled test table built as a new frame, so the Potability column is not
# added to the feature matrix X_test_not_sd itself.
Test_not_sd = X_test_not_sd.assign(Potability=y_test_not_sd)
Test_not_sd.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 503 entries, 367 to 1301 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 503 non-null float64 1 Hardness 503 non-null float64 2 Solids 503 non-null float64 3 Chloramines 503 non-null float64 4 Sulfate 503 non-null float64 5 Conductivity 503 non-null float64 6 Organic_carbon 503 non-null float64 7 Trihalomethanes 503 non-null float64 8 Turbidity 503 non-null float64 9 Potability 503 non-null float64 dtypes: float64(10) memory usage: 43.2 KB
# Class balance of the un-standardised training set.
Train_not_sd.Potability.value_counts().plot(kind ='pie')
<AxesSubplot:ylabel='Potability'>
# Class balance of the un-standardised test set.
Test_not_sd.Potability.value_counts().plot(kind ='pie')
<AxesSubplot:ylabel='Potability'>
# Hold out 20% of the standardised data as the test set; the remaining 80%
# (X_temp / y_temp) is split again into training and validation sets.
X_temp, X_test, y_temp, y_test = train_test_split(
    wq_df.loc[:, ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                  'Organic_carbon', 'Trihalomethanes', 'Turbidity']],
    wq_df['Potability'], random_state=0, test_size=0.2)
# New frame rather than an alias, so X_test keeps only the feature columns.
Test = X_test.assign(Potability=y_test)
Test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 403 entries, 367 to 2628 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 403 non-null float64 1 Hardness 403 non-null float64 2 Solids 403 non-null float64 3 Chloramines 403 non-null float64 4 Sulfate 403 non-null float64 5 Conductivity 403 non-null float64 6 Organic_carbon 403 non-null float64 7 Trihalomethanes 403 non-null float64 8 Turbidity 403 non-null float64 9 Potability 403 non-null float64 dtypes: float64(10) memory usage: 34.6 KB
# Labelled remainder (training + validation pool), built as a new frame so
# the Potability column is not added to X_temp itself.
Temp = X_temp.assign(Potability=y_temp)
Temp.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1608 entries, 1594 to 1099 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 1608 non-null float64 1 Hardness 1608 non-null float64 2 Solids 1608 non-null float64 3 Chloramines 1608 non-null float64 4 Sulfate 1608 non-null float64 5 Conductivity 1608 non-null float64 6 Organic_carbon 1608 non-null float64 7 Trihalomethanes 1608 non-null float64 8 Turbidity 1608 non-null float64 9 Potability 1608 non-null float64 dtypes: float64(10) memory usage: 138.2 KB
# Take 25% of the remaining 80% as the validation set, i.e. 20% of all
# samples, which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    Temp.loc[:, ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                 'Organic_carbon', 'Trihalomethanes', 'Turbidity']],
    Temp['Potability'], random_state=0, test_size=0.25)
# New frame rather than an alias, so X_val keeps only the feature columns.
Val = X_val.assign(Potability=y_val)
Val.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 402 entries, 225 to 196 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 402 non-null float64 1 Hardness 402 non-null float64 2 Solids 402 non-null float64 3 Chloramines 402 non-null float64 4 Sulfate 402 non-null float64 5 Conductivity 402 non-null float64 6 Organic_carbon 402 non-null float64 7 Trihalomethanes 402 non-null float64 8 Turbidity 402 non-null float64 9 Potability 402 non-null float64 dtypes: float64(10) memory usage: 34.5 KB
# Labelled training table, built as a new frame so the target column is not
# added to the feature matrix X_train.
Train = X_train.assign(Potability=y_train)
Train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1206 entries, 2130 to 2338 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 1206 non-null float64 1 Hardness 1206 non-null float64 2 Solids 1206 non-null float64 3 Chloramines 1206 non-null float64 4 Sulfate 1206 non-null float64 5 Conductivity 1206 non-null float64 6 Organic_carbon 1206 non-null float64 7 Trihalomethanes 1206 non-null float64 8 Turbidity 1206 non-null float64 9 Potability 1206 non-null float64 dtypes: float64(10) memory usage: 103.6 KB
# Class balance of the validation set.
Val.Potability.value_counts().plot(kind ='pie')
<AxesSubplot:ylabel='Potability'>
输出清洗后的数据集
# Output file name -> frame to write. All files are written to the current
# working directory with a header row and without the index column.
csv_exports = {
    'data of water quality after cleaning.csv': wq_df,
    'data of water quality after cleaning(未标准化).csv': wq_df_not_sd,
    '分类训练集(未标准化).csv': Train_not_sd,
    '分类测试集(未标准化).csv': Test_not_sd,
    '训练集(1206个).csv': Train,
    '验证集(402个).csv': Val,
    '测试集(403个).csv': Test,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False, header=True)